# Immigration and the top 1%

# proportion of foreigners in different industries, age, sex

## Year to analyse; it is used to build both input file names and to derive age
y <- 2017

# # # #

# Build the path to the .fst input for the chosen year
file_name <- paste0('datalab_data_folder/SA_PAYE', y, '.fst')

# Read the fst file and convert the result to a tibble in one step
data_top <- as_tibble(fst::read_fst(file_name))

# Coerce the weights to numeric, then set missing weights to zero so that
# they contribute nothing to the cumulative and weighted sums further down
data_top <- data_top %>%
  mutate(weight = as.numeric(weight),
         weight = replace(weight, is.na(weight), 0))



# Build the path to the gzipped CSV for the same year
file_name <- paste0('datalab_data_folder/SA', y, '.csv.gz')

# Read only the three identifier columns and the sic column; every other
# column in the file is skipped by cols_only()
indus <- read_csv(
  file_name,
  col_types = cols_only(
    anon_utr = col_double(),
    nino_anon = col_character(),
    id = col_character(),
    sic = col_double()
  )
)


# # # #

# Keep rows while the running total of the weights stays below psize.
# NOTE(review): psize is not defined in the visible code, and this cut only
# selects the top of the distribution if the rows are already ordered from
# highest to lowest income -- confirm both against the calling script.
data_top <- data_top %>%
  mutate(w_rank = cumsum(weight)) %>%
  filter(w_rank < psize)

# Distinct values of each identifier present in the retained rows
utr_list <- data_top %>% pull(anon_utr) %>% unique()
nino_list <- data_top %>% pull(nino_anon) %>% unique()
id_list <- data_top %>% pull(id) %>% unique()

# Drop missing identifiers so that NA never counts as a match below
utr_list <- utr_list[!is.na(utr_list)]
nino_list <- nino_list[!is.na(nino_list)]
id_list <- id_list[!is.na(id_list)]

# Keep the industry rows that match the retained rows on at least one of
# the three identifiers
small_indus <- indus %>%
  filter(anon_utr %in% utr_list | nino_anon %in% nino_list | id %in% id_list)


# # # #

## Attach the industry codes to the retained rows, matching on id.
# The commented-out alternative key for this join was
# c('anon_utr', 'nino_anon'). Because those two columns exist in both tables
# and are not join keys here, they come out of the join with .x/.y suffixes.
data <- data_top %>%
  left_join(small_indus, by = "id") %>%
  as_tibble()


# # # #

# Total weighted income of the retained rows; this is the denominator of the
# industry income shares computed below
ti_all <- sum(data$ti * data$weight)

## Weighted average income by industry for migrants and natives
#
# Notes on the aggregation:
# * base mean() has no weight argument (an extra named argument is silently
#   ignored), so weighted.mean() is used for the averages.
# * sum() adds up every argument it receives, so the weighted totals are
#   written as sum(value * weight), matching the definition of ti_all.
# * summarise() evaluates its expressions in order and later expressions see
#   earlier results. The totals are therefore computed BEFORE ti and tei are
#   replaced by their group means, and select() afterwards restores the
#   column order of the output file.
# * A group whose weights are all zero yields NaN for its weighted means.
tab <- data %>%
  mutate_at(vars(tei, tii, emp_inc, selfemp_inc), as.numeric) %>%
  group_by(sic, migrant_comb) %>%
  summarise(n_unwt = n(),
            n_wt = sum(weight),
            sum_ti = sum(ti * weight),
            sum_tei = sum(tei * weight),
            ti = weighted.mean(ti, w = weight),
            tei = weighted.mean(tei, w = weight),
            tii = weighted.mean(tii, w = weight),
            emp_inc = weighted.mean(emp_inc, w = weight),
            self_emp_inc = weighted.mean(selfemp_inc, w = weight),
            ti_ind_sh = sum_ti / ti_all * 100) %>%
  select(sic, migrant_comb, n_unwt, n_wt,
         ti, tei, tii, emp_inc, self_emp_inc,
         sum_ti, sum_tei, ti_ind_sh)

# Report the total income so it can be kept alongside the output
print(paste0("Total income of top 1%:", ti_all))

# Industry codes that have at least one cell with fewer than 100 unweighted
# observations; every row of such an industry is suppressed
ind_cod <- unique(tab$sic[tab$n_unwt < 100])

# Drop the suppressed industries, and also the rows with a missing industry
# code, which the suppression list above does not necessarily cover
tab <- tab %>%
  filter(!sic %in% ind_cod,
         !is.na(sic))

# Write the industry table
write_csv(tab, 'output/industry.csv')


# # # # #

## AGE

# tyob is coerced to numeric so it can be subtracted from the year
data <- data %>%
  mutate(tyob = as.numeric(tyob))

# Age distribution of migrants and natives.
# Ages are binned as [0, 20], (20, 22], (22, 24] and then one bin per year up
# to 90; each bin is labelled by the largest age observed in it.
# NOTE(review): ages outside the breaks get an NA bin from cut(). The final
# filter removes that bin only when its max(age) is NA, so if no age is
# missing, ages above 90 are kept as one bar at the largest age -- confirm
# this is intended.
dgraph <- data %>%
  mutate(age = y - tyob,
         age_group = cut(age,
                         breaks = c(0, 20, 22, 24:90),
                         labels = FALSE,
                         include.lowest = TRUE)) %>%
  group_by(age_group) %>%
  mutate(age_group_max = max(age)) %>%
  group_by(age_group_max, migrant_comb) %>%
  summarise(n = n()) %>%
  filter(!is.na(age_group_max))

# Dodged bar chart of counts by age bin, one colour per migrant_comb value
ggplot(dgraph, aes(x = age_group_max, y = n, fill = migrant_comb)) +
  geom_col(position = 'dodge') +
  theme_minimal() +
  scale_fill_manual(values = c('navy', 'maroon'))

# Save the most recent plot (ggsave() defaults to it)
ggsave(filename = 'age.pdf', path = 'output/',
       dpi = 'retina', width = 16, height = 9, units = 'cm')


# Write the counts behind the chart
write_csv(dgraph, 'output/age.csv')



# # # # 

## SEX

## Unweighted row counts for every combination of migrant_comb and sex
dgraph <- data %>%
  group_by(migrant_comb, sex) %>%
  summarise(n = n())

# Dodged bar chart: migrant_comb on the x axis, one colour per sex value
ggplot(dgraph, aes(x = migrant_comb, y = n, fill = factor(sex))) +
  geom_col(position = 'dodge') +
  theme_minimal() +
  scale_fill_manual(values = c('navy', 'maroon'))

# Save the most recent plot (ggsave() defaults to it)
ggsave(filename = 'sex_distribution.pdf', path = 'output/',
       dpi = 'retina', width = 16, height = 9, units = 'cm')

# Write the counts behind the chart
write_csv(dgraph, 'output/sex_distribution.csv')
